from urllib.request import ProxyHandler, build_opener

import requests
from fake_useragent import UserAgent
from random import randint
from time import sleep
import re


def get_html(url):
    """Fetch *url* and return its decoded HTML text, or None on a non-200 status.

    Sleeps 5-10 seconds before each request to throttle the crawl, and sends a
    random User-Agent header on every call.

    Returns:
        str | None: the UTF-8 decoded page body, or None when the server
        responds with any status other than 200.
    """
    headers = {
        "User-Agent": UserAgent().random
    }
    # Random 5-10 second delay between requests (polite-crawl throttling).
    sleep(randint(5, 10))

    # timeout keeps a dead server from hanging the crawl forever.
    response = requests.get(url, headers=headers, timeout=30)

    # Force UTF-8 decoding; presumably the site does not always declare
    # its charset correctly — the original author set this explicitly.
    response.encoding = 'utf-8'
    if response.status_code == 200:
        # Callers run regexes over the page, so return the text body,
        # not the raw Response object (which re.findall cannot accept).
        return response.text
    return None


def parse_index(html):
    """Extract movie-detail links from the films index page.

    Scans *html* for anchor tags pointing at /films/<id> detail pages and
    returns them as absolute maoyan.com URLs.
    """
    # \d+ matches the numeric movie id; note the pattern anchors on the
    # single leading space that precedes the <a> tag in the page markup.
    pattern = (r' <a href="(/films/\d+)" target="_blank" '
               r'data-act="movies-click" data-val="{movieId:\d+}">')
    paths = re.findall(pattern, html)
    return [f'http://maoyan.com{path}' for path in paths]


def parse_info(html):
    """Parse a movie detail page into a dict of name, type and actor set.

    A field that is absent from the page yields None (actors: an empty set)
    instead of raising IndexError, so one malformed page cannot abort the
    whole crawl.

    Returns:
        dict: {"name": str | None, "type": str | None, "actor": set[str]}
    """
    def _first(pattern):
        # First regex capture in the page, or None when the pattern is absent
        # (the original indexed [0] unconditionally and crashed on a miss).
        matches = re.findall(pattern, html)
        return matches[0] if matches else None

    name = _first(r'<h1 class="name">(.+)</h1>')
    types = _first(r'<li class="ellipsis">(.+)</li>')

    # Raw string is required here: '\s' in a normal literal is an invalid
    # escape sequence in modern Python.
    actors = re.findall(
        r'<li class="celebrity actor".+>\s+<a href="/films/cel.+>\s+<img.+>\s+</a>\s+<div.+>\s+<a.+>\s+(.+)\s+</a>',
        html)
    return {
        "name": name,
        "type": types,
        # set() deduplicates actors credited more than once on the page.
        "actor": set(actors)
    }


def format_actors(actors):
    """Return the actor names from *actors* deduplicated as a set.

    The set() constructor replaces the original manual add-loop, which
    re-implemented exactly this behavior by hand.
    """
    return set(actors)


def main():
    """Crawl the maoyan films index and print parsed info for each movie.

    Fetches the (pre-filtered) index page, extracts the detail-page URLs,
    then downloads and parses each one. Pages whose download fails
    (get_html returns None on non-200) are skipped instead of crashing
    the parsers, which expect an HTML string.
    """
    # index_url = 'https://maoyan.com/films'
    # NOTE(review): query parameters look like a category/listing filter —
    # exact semantics of catId/sourceId/showType not verifiable from here.
    index_url = 'https://maoyan.com/films?catId=2&sourceId=2&showType=3'
    html = get_html(index_url)
    if html is None:
        # Index fetch failed; nothing to crawl.
        return
    movie_urls = parse_index(html)
    for url in movie_urls:
        movie_html = get_html(url)
        if movie_html is None:
            continue  # skip detail pages that failed to download
        movie = parse_info(movie_html)
        print(movie)

    print(movie_urls)


if __name__ == '__main__':
    # Run the crawler only when executed as a script, not on import.
    main()
